Company wants to see overview of Instacart's business:
| Phase | Time | Resources | Risks |
|---|---|---|---|
| Business Understanding | 1 day | All analysts | Economic and market changes |
| Data Understanding | 1 day | All analysts | Data problems, technological problems |
| Data Preparation | 2 days | Data scientists, DB engineers | Data problems, technological problems |
| Modeling | 1 day | Data scientists | Technological problems, inability to build adequate model |
| Evaluation | 1 day | All analysts | Economic change, inability to implement results |
| Deployment | 1 day | Data scientists, DB engineers, implementation team | Economic change, inability to implement results |
departments.csv
order_products.csv
orders.csv
products.csv
# Data Processing
import numpy as np
import pandas as pd
# Apriori & Recommendation
import mlxtend as ml
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from sklearn.metrics.pairwise import cosine_similarity
#Visualization & Clustering
import seaborn as sns
color = sns.color_palette()
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure, text
import networkx as nx
import plotly.express as px
from sklearn.cluster import KMeans, AgglomerativeClustering
import plotly.graph_objs as go
# Utilities and others
import os
import warnings
warnings.filterwarnings("ignore")
# Print library versions for reproducibility of the analysis environment.
for lib in (nx, matplotlib, np):
    print(lib.__version__)
# Read the four Instacart CSVs from the current working directory.
orders = pd.read_csv(os.path.join(os.getcwd(),'orders.csv'))
products = pd.read_csv(os.path.join(os.getcwd(),'products.csv'))
order_products = pd.read_csv(os.path.join(os.getcwd(),'order_products.csv'))
departments = pd.read_csv(os.path.join(os.getcwd(),'departments.csv'))
# Check the number of unique orders and unique products
orders_Unique = len(set(order_products.order_id))
products_Unique = len(set(order_products.product_id))
print("There are %s orders for %s products" %(orders_Unique, products_Unique))
# Number of distinct customers across all orders.
print("Number of unique customers in the whole dataset : ",len(set(orders.user_id)))
#Days of Orders in a week
# `orders` has one row per order_id, so the groupby-sum is an identity
# transform; value_counts then yields the number of orders per weekday.
grouped = orders.groupby("order_id")["order_dow"].aggregate("sum").reset_index()
grouped = grouped.order_dow.value_counts()
sns.set_style('darkgrid')
f, ax = plt.subplots(figsize=(15, 6))
# Fix: pass data as keyword args — positional x/y were removed in seaborn 0.12+.
sns.barplot(x=grouped.index, y=grouped.values, color='#ff8200')
plt.ylabel('Number of orders', fontsize=13)
plt.xlabel('Days of order in a week', fontsize=13)
ax.set_facecolor('#fff0db')
plt.show()
#Hours of Order in a Day
# One row per order_id, so the sum is an identity; count orders per hour.
grouped = orders.groupby("order_id")["order_hour_of_day"].aggregate("sum").reset_index()
grouped = grouped.order_hour_of_day.value_counts()
sns.set_style('darkgrid')
f, ax = plt.subplots(figsize=(15, 6))
# Fix: keyword x/y — positional data args were removed in seaborn 0.12+.
sns.barplot(x=grouped.index, y=grouped.values, color='#43b02a')
plt.ylabel('Number of orders', fontsize=13)
plt.xlabel('Hours of order in a day', fontsize=13)
ax.set_facecolor('#fff0db')
plt.show()
# Order counts for every (day-of-week, hour-of-day) cell, shown as a heatmap.
grouped_df = orders.groupby(["order_dow", "order_hour_of_day"])["order_number"].aggregate("count").reset_index()
# Fix: DataFrame.pivot lost positional-argument support in pandas 2.0.
grouped_df = grouped_df.pivot(index='order_dow', columns='order_hour_of_day', values='order_number')
plt.figure(figsize=(12,6))
sns.heatmap(grouped_df, cmap="Greens")
plt.title("Frequency of Day of week Vs Hour of day")
plt.show()
#Period of Reorders
# Distribution of days-since-prior-order across orders.
grouped = orders.groupby("order_id")["days_since_prior_order"].aggregate("sum").reset_index()
grouped = grouped.days_since_prior_order.value_counts()
from matplotlib.ticker import FormatStrFormatter
f, ax = plt.subplots(figsize=(15, 6))
# Fix: keyword x/y — positional data args were removed in seaborn 0.12+.
sns.barplot(x=grouped.index, y=grouped.values, color='#ff8200')
ax.xaxis.set_major_formatter(FormatStrFormatter('%.0f'))
plt.ylabel('Number of orders', fontsize=13)
plt.xlabel('Period of reorder', fontsize=13)
ax.set_facecolor('#fff0db')
plt.show()
# The chart indicates Weekly and Monthly orders are the most popular.
#Do people usually reorder the same previous ordered products ?
# Count products per reordered flag and express each count as a percentage.
grouped = order_products.groupby("reordered")["product_id"].agg(Total = 'count').reset_index()
overall_total = grouped["Total"].sum()
grouped['Ratios'] = (grouped["Total"] / overall_total * 100).round(2)
grouped
#59 % of ordered products are previously ordered by customers
Now that we have seen 59% of the products are re-ordered, there will also be situations when none of the products are re-ordered. Let us check that now.
# Share of orders containing at least one reordered item.
grouped_df = order_products.groupby("order_id")["reordered"].agg("sum").reset_index()
# Fix: assign through .loc on the frame itself. The original chained
# `df["col"].loc[mask] = 1` writes to a temporary and is unreliable
# under pandas copy-on-write (silent no-op in pandas 3.x).
grouped_df.loc[grouped_df["reordered"] > 1, "reordered"] = 1
grouped_df.reordered.value_counts() / grouped_df.shape[0]
About 12% of the orders have no re-ordered items.
# Distribution of basket sizes (max add_to_cart_order per order).
grouped = order_products.groupby("order_id")["add_to_cart_order"].agg("max").reset_index()
grouped = grouped.add_to_cart_order.value_counts()
sns.set_style('darkgrid')
f, ax = plt.subplots(figsize=(15, 8))
plt.xticks(rotation='vertical')
# Fix: keyword x/y — positional data args were removed in seaborn 0.12+.
sns.barplot(x=grouped.index, y=grouped.values, color='#43b02a')
plt.title("Number of products bought in each order", fontsize=15)
ax.set_facecolor('#fff0db')
plt.ylabel('Number of Orders', fontsize=13)
plt.xlabel('Number of products added in order', fontsize=13)
plt.show()
A right-tailed distribution with the maximum value at 5.
# Rank the top 10 best-selling items
grouped = order_products.groupby("product_id")["reordered"].agg(frequency_count = 'count').reset_index()
grouped = pd.merge(grouped, products[['product_id', 'product_name']], how='left', on=['product_id'])
# (Removed an unused `percent` intermediate that was never referenced.)
grouped = grouped.sort_values(by='frequency_count', ascending=False)[:10]
grouped[['product_name','frequency_count']].reset_index(drop=True)
grouped = grouped.groupby(['product_name']).sum()['frequency_count'].sort_values(ascending=False)
sns.set_style('darkgrid')
f, ax = plt.subplots(figsize=(15, 6))
plt.xticks(rotation='vertical')
# Fix: keyword x/y — positional data args were removed in seaborn 0.12+.
sns.barplot(x=grouped.index, y=grouped.values, color='#ff8200')
plt.ylabel('Number of Reorders', fontsize=13)
plt.xlabel('Most ordered Products', fontsize=13)
ax.set_facecolor('#fff0db')
plt.show()
#Which products are usually reordered
# Reorder probability = reorders / total orders per product; keep products
# ordered more than 75 times to avoid small-sample noise.
grouped = order_products.groupby("product_id")["reordered"].agg(reorder_sum = 'sum', reorder_total = 'count').reset_index()
grouped['reorder_probability'] = grouped['reorder_sum'] / grouped['reorder_total']
grouped = pd.merge(grouped, products[['product_id', 'product_name']], how='left', on=['product_id'])
grouped = grouped[grouped.reorder_total > 75].sort_values(['reorder_probability'], ascending=False)[:10]
grouped
grouped = grouped.groupby(['product_name']).sum()['reorder_probability'].sort_values(ascending=False)
sns.set_style('darkgrid')
f, ax = plt.subplots(figsize=(15, 6))
plt.xticks(rotation='vertical')
# Fix: keyword x/y — positional data args were removed in seaborn 0.12+.
sns.barplot(x=grouped.index, y=grouped.values, color='#43b02a')
plt.ylim([0.65,0.80])
plt.ylabel('Reorder probability', fontsize=13)
plt.xlabel('Most reordered products', fontsize=12)
ax.set_facecolor('#fff0db')
plt.show()
#obtaining user, order and product detailed info for prior set
user_order_products=pd.merge(orders,order_products, on='order_id',how='left')
user_order_products_all_details=pd.merge(user_order_products,products,on='product_id',how='left')
#extracting order_dow_hod_reord_count, the number of reorders for a particular day of the week and particular hour of the day
order_dow_hod_reord_count=user_order_products_all_details.groupby(['order_dow','order_hour_of_day']).agg({'reordered':sum}).reset_index()
order_dow_hod_reord_count.columns=['order_dow','order_hour_of_day','order_dow_hod_reord_count']
#extracting order_dow_hod_reord_prop, the proportion of reorders for a particular day of the week and particular hour of the day
# NOTE(review): pd.DataFrame(df) creates a NEW frame — columns added below
# exist only on order_dow_hod_reord_prop, not on order_dow_hod_reord_count.
order_dow_hod_reord_prop=pd.DataFrame(order_dow_hod_reord_count)
order_dow_hod_reord_prop['order_dow_hod_reord_cnt']=user_order_products_all_details.groupby(['order_dow','order_hour_of_day']).agg({'reordered':'count'}).reset_index().reordered
# Proportion = reorder sum / total line-items for the (dow, hod) cell.
order_dow_hod_reord_prop['order_dow_hod_reord_prop']=order_dow_hod_reord_prop['order_dow_hod_reord_count']/order_dow_hod_reord_prop['order_dow_hod_reord_cnt']
order_dow_hod_reord_prop.drop(['order_dow_hod_reord_count','order_dow_hod_reord_cnt'],axis=1,inplace=True)
#Plotting heatmap illustrating reordering probability in order_dow, order_hour_of_day space
plt.figure(figsize=(15,6))
# Bug fix: the proportion column lives on order_dow_hod_reord_prop — the
# original pivoted order_dow_hod_reord_count, which does not carry
# 'order_dow_hod_reord_prop' and raises KeyError. Also use keyword args,
# required for DataFrame.pivot since pandas 2.0.
sns.heatmap(order_dow_hod_reord_prop.pivot(index='order_dow', columns='order_hour_of_day', values='order_dow_hod_reord_prop'), annot=True, cmap="Greens")
plt.title("Reorder ratio of Day of week Vs Hour of day")
plt.show()
From the above plot, we can observe that the probability of reordering is highest on the 6th day of the week at the 5th hour of the day, followed by the 1st day of the week at the 5th hour of the day.
# products bought distribution by department
combdf = pd.merge(order_products,products, on="product_id")
combdf = pd.merge(combdf,departments, on ='department_id')
plt.figure(figsize=(10,10))
# Share of purchased line-items per department.
temp_series = combdf['department'].value_counts()
labels = (np.array(temp_series.index))
sizes = (np.array((temp_series / temp_series.sum())*100))
plt.pie(sizes, labels=labels,
autopct='%1.1f%%', startangle=200)
plt.title("Departments distribution", fontsize=15)
plt.show()
#reorder ratio by department
grouped_df = combdf.groupby(["department"])["reordered"].aggregate("mean").reset_index()
grouped_df
plt.figure(figsize=(12,8))
# Fix: keyword x/y — positional data args were removed in seaborn 0.12+.
sns.pointplot(x=grouped_df['department'].values, y=grouped_df['reordered'].values, alpha=0.8, color='#ff8200')
plt.ylabel('Reorder ratio', fontsize=12)
plt.xlabel('Department', fontsize=12)
plt.title("Department wise reorder ratio", fontsize=15)
plt.xticks(rotation='vertical')
ax = plt.gca()
ax.set_facecolor('#fff0db')
plt.show()
# Mean reorder ratio per product.
grouped_df = combdf.groupby(["product_name"])["reordered"].aggregate("mean").reset_index()
grouped_df
plt.figure(figsize=(20,8))
# Fix: keyword x/y — positional data args were removed in seaborn 0.12+.
sns.pointplot(x=grouped_df['product_name'].values, y=grouped_df['reordered'].values, alpha=0.8, color='#43b02a')
plt.ylabel('Reorder ratio', fontsize=12)
plt.xlabel('Product', fontsize=12)
plt.title("Product wise reorder ratio", fontsize=15)
plt.xticks(rotation='vertical')
ax = plt.gca()
ax.set_facecolor('#fff0db')
plt.show()
#values with low reorder rate are likely to be one-shot purchases for a lot of customers
# Mean reorder ratio per day of week.
grouped_df = user_order_products_all_details.groupby(["order_dow"])["reordered"].aggregate("mean").reset_index()
plt.figure(figsize=(12,8))
# Fix: keyword x/y — positional data args were removed in seaborn 0.12+.
sns.barplot(x=grouped_df['order_dow'].values, y=grouped_df['reordered'].values, alpha=0.8, color='#ff8200')
plt.ylabel('Reorder ratio', fontsize=12)
plt.xlabel('Day of week', fontsize=12)
plt.title("Reorder ratio across day of week", fontsize=15)
plt.xticks(rotation='vertical')
plt.ylim(0.5, 0.7)
ax = plt.gca()
ax.set_facecolor('#fff0db')
plt.show()
# Mean reorder ratio per hour of day.
grouped_df = user_order_products_all_details.groupby(["order_hour_of_day"])["reordered"].aggregate("mean").reset_index()
plt.figure(figsize=(12,8))
# Fix: keyword x/y — positional data args were removed in seaborn 0.12+.
sns.barplot(x=grouped_df['order_hour_of_day'].values, y=grouped_df['reordered'].values, alpha=0.8, color='#43b02a')
plt.ylabel('Reorder ratio', fontsize=12)
plt.xlabel('Hour of day', fontsize=12)
plt.title("Reorder ratio across hour of day", fontsize=15)
plt.xticks(rotation='vertical')
plt.ylim(0.5, 0.7)
ax = plt.gca()
ax.set_facecolor('#fff0db')
plt.show()
#plotting pdf of order_number
# Overlaid density of order_number split by the reordered flag.
# NOTE(review): sns.distplot is deprecated (removed in seaborn 0.14) —
# consider sns.histplot/kdeplot when upgrading.
my_pal = {"#ff8200",'#43b02a'}
sns.FacetGrid(user_order_products_all_details,hue='reordered',height=6, palette=my_pal).map(sns.distplot,'order_number',bins=20)\
.set(xticks=range(0,101,5))\
.add_legend()
ax = plt.gca()
ax.set_facecolor('#fff0db')
plt.show()
#plotting boxplot of order_number
sns.boxplot(x='reordered',y='order_number',data=user_order_products_all_details, palette=my_pal)
ax = plt.gca()
ax.set_facecolor('#fff0db')
plt.show()
From the above plot, we can observe that when the order number is less than 10 the non-reordered class dominates and when order number is greater than 10, the reordered class starts to dominate.
#plotting PDF of first 50 add_to_cart_order for better view
# Density of cart position split by reordered flag, truncated at 50.
# NOTE(review): sns.distplot is deprecated — see earlier cells.
my_pal = {"#ff8200",'#43b02a'}
sns.FacetGrid(user_order_products_all_details[user_order_products_all_details.add_to_cart_order<=50],hue='reordered',height=6,palette=my_pal)\
.map(sns.distplot,'add_to_cart_order',bins=200)\
.add_legend()
ax = plt.gca()
ax.set_facecolor('#fff0db')
plt.show()
#plotting boxplot of add_to_cart_order
sns.boxplot(x='reordered',y='add_to_cart_order',data=user_order_products_all_details,palette=my_pal).set(ylim=(0,80))
ax = plt.gca()
ax.set_facecolor('#fff0db')
plt.show()
From the above plot we can observe that when add_to_cart_order is less than or equal to 5 , the number of reorders dominate while after 5, the number of non-reorders dominate.
# probability of reordering
my_pal = {"#ff8200",'#43b02a'}
# Order counts per (cart position, reordered flag), first 30 cart positions,
# drawn as grouped bars.
temp = (user_order_products_all_details
        .groupby(['add_to_cart_order', 'reordered'])
        .size()
        .unstack()
        .head(30))
temp.plot(kind="bar", rot=90, stacked=False, color=my_pal)
ax = plt.gca()
ax.set_facecolor('#fff0db')
plt.show()
#most reordered product
# Top 30 products by number of reordered line-items.
temp = user_order_products_all_details[user_order_products_all_details.reordered == 1]
temp = temp.product_name.value_counts()
temp = temp.head(30)
plt.figure(figsize=(15,8))
temp.plot(kind = "bar", rot = 90, stacked = False, color='#ff8200')
ax = plt.gca()
ax.set_facecolor('#fff0db')
plt.show()
# Number of customers per order-count bucket.
grouped = orders.groupby('user_id')['order_id'].apply(lambda x: len(x.unique())).reset_index()
grouped = grouped.groupby('order_id').aggregate("count")
sns.set_style("darkgrid")
f, ax = plt.subplots(figsize=(15, 6))
ax.set_facecolor('#fff0db')
# Fix: keyword x/y — positional data args were removed in seaborn 0.12+.
sns.barplot(x=grouped.index, y=grouped.user_id, color='#43b02a')
plt.ylabel('Numbers of Customers')
plt.xlabel('Number of Orders per customer')
plt.xticks(rotation='vertical')
plt.show()
# Catalogue size per department (count of distinct products).
items = pd.merge(left=products, right=departments, how='left')
grouped = items.groupby("department")["product_id"].agg(Total_products = 'count').reset_index()
grouped['Ratio'] = round(grouped["Total_products"].apply(lambda x: x /grouped['Total_products'].sum())*100,2)
grouped.sort_values(by='Total_products', ascending=False, inplace=True)
grouped.reset_index(drop=True)
grouped = grouped.groupby(['department']).sum()['Total_products'].sort_values(ascending=False)
sns.set_style("darkgrid")
f, ax = plt.subplots(figsize=(15, 6))
ax.set_facecolor('#fff0db')
plt.xticks(rotation='vertical')
# Fix: keyword x/y — positional data args were removed in seaborn 0.12+.
sns.barplot(x=grouped.index, y=grouped.values, color='#ff8200')
plt.ylabel('Number of products', fontsize=13)
plt.xlabel('Departments', fontsize=13)
plt.show()
# Orders (line-items) per department.
users_flow = orders[['user_id', 'order_id']].merge(order_products[['order_id', 'product_id']],
how='inner', left_on='order_id', right_on='order_id')
users_flow = users_flow.merge(items, how='inner', left_on='product_id',
right_on='product_id')
grouped = users_flow.groupby("department")["order_id"].agg( Total_orders = 'count').reset_index()
grouped['Ratio'] = round(grouped["Total_orders"].apply(lambda x: x /grouped['Total_orders'].sum())*100,2)
grouped.sort_values(by='Total_orders', ascending=False, inplace=True)
grouped.reset_index(drop=True)
grouped = grouped.groupby(['department']).sum()['Total_orders'].sort_values(ascending=False)
f, ax = plt.subplots(figsize=(15, 6))
ax.set_facecolor('#fff0db')
plt.xticks(rotation='vertical')
# Fix: keyword x/y — positional data args were removed in seaborn 0.12+.
sns.barplot(x=grouped.index, y=grouped.values, color='#43b02a')
plt.ylabel('Number of Orders', fontsize=13)
plt.xlabel('Departments', fontsize=13)
plt.show()
# One-hot encode order->department rows and inspect cross-department correlation.
filtered = combdf[['order_id','department']]
ohe_filtered = pd.get_dummies(filtered)
# NOTE(review): get_dummies expands only 'department'; the numeric order_id
# column is kept as-is and therefore appears in the correlation matrix too.
corr = ohe_filtered.corr()
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(145, 300, s=60, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5})
ax = plt.gca()
ax.set_facecolor('#fff0db')
plt.show()
#top-10 most ordering customers
# Join orders with their line-items, then rank customers by item count.
temp = orders.merge(order_products, on='order_id')
customer_item_counts = temp.groupby('user_id')['product_id'].count()
customer_item_counts.sort_values(ascending=False).head(10).to_frame()
# Mean reorder ratio per cart position (first 30 positions only).
add_to_cart_order_reordered_ratio=order_products[order_products['add_to_cart_order']<30]
add_to_cart_order_reordered_ratio = add_to_cart_order_reordered_ratio.groupby('add_to_cart_order')['reordered'].mean().reset_index()
plt.figure(figsize=(25,8))
# Fix: keyword x/y — positional data args were removed in seaborn 0.12+.
sns.pointplot(x=add_to_cart_order_reordered_ratio.add_to_cart_order, y=add_to_cart_order_reordered_ratio.reordered, color='#ff8200')
plt.title('Add To Cart Order vs. Reorder Ratio', fontsize=16)
plt.xlabel('Add To Cart Order', fontsize=16)
plt.xticks(fontsize=15)
plt.xticks(rotation='vertical')
plt.ylabel('Reorder Ratio', fontsize=16)
plt.yticks(fontsize=15);
ax = plt.gca()
ax.set_facecolor('#fff0db')
plt.show()
# top 15 first choices
# Products most often placed first into the cart.
grouped = user_order_products_all_details[user_order_products_all_details['add_to_cart_order'] == 1].groupby(['product_name'])['order_id'].agg(['count']).sort_values(by='count',ascending=False)
grouped[:15]
# top 15 second choices
grouped = user_order_products_all_details[user_order_products_all_details['add_to_cart_order'] == 2].groupby(['product_name'])['order_id'].agg(['count']).sort_values(by='count',ascending=False)
grouped[:15]
# groupby product frequency and average position in cart
user_order_products_all_details.groupby(['product_name'])['add_to_cart_order'].agg(['count','mean']).sort_values(by='count',ascending=False)
user_order_products_all_details.groupby(['product_name'])['add_to_cart_order'].agg(['count','mean']).sort_values(by='mean',ascending=True).head(15)
# Most common (modal) cart position per product.
user_order_products_all_details.groupby(['product_name'])['add_to_cart_order'].apply(lambda x: x.mode().iloc[0]).sort_values(ascending=True).head(15)
# Frequency table reused below for the word cloud.
grouped = order_products.groupby("product_id")["reordered"].agg(frequency_count = 'count').reset_index()
grouped = pd.merge(grouped, products[['product_id', 'product_name']], how='left', on=['product_id'])
percent = grouped.product_name.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
grouped = grouped.sort_values(by= 'frequency_count', ascending=False)
grouped[['product_name','frequency_count']].reset_index(drop=True)
# Import packages
import matplotlib.pyplot as plt
%matplotlib inline
# Define a function to plot word cloud
def plot_cloud(wordcloud):
    """Display a generated word-cloud image on a large, axis-free figure."""
    plt.figure(figsize=(40, 30))
    plt.imshow(wordcloud)
    plt.axis("off")
# Map product name -> order frequency for word-cloud weights.
data = dict(zip(grouped['product_name'].tolist(), grouped['frequency_count'].tolist()))
# Import packages
from wordcloud import WordCloud
from PIL import Image
# Import image to np.array
# NOTE(review): requires 'basket.png' in the working directory.
mask = np.array(Image.open('basket.png'))
# Generate wordcloud
wordcloud = WordCloud(width = 2500, height = 1500, random_state=1, background_color='white', colormap='Set2', collocations=False, mask=mask).generate_from_frequencies(data)
# Plot
plot_cloud(wordcloud)
# Per-product order counts by hour of day (uses `temp` = orders x order_products).
product_orders_by_hour = pd.DataFrame({'count': temp.groupby(['product_id', 'order_hour_of_day']).size()}).reset_index()
# Within each product, convert hourly counts into percentages.
product_orders_by_hour['pct'] = product_orders_by_hour.groupby('product_id')['count'].apply(lambda x: x/x.sum()*100)
# Count-weighted mean ordering hour per product.
mean_hour = pd.DataFrame({'mean_hour': product_orders_by_hour.groupby('product_id').apply(lambda x: sum(x['order_hour_of_day'] * x['count'])/sum(x['count']))}).reset_index()
# 15 products with the earliest and latest mean ordering hour.
morning = mean_hour.sort_values('mean_hour')[:15]
morning = morning.merge(products, on='product_id')
morning
afternoon = mean_hour.sort_values('mean_hour', ascending=False)[:15]
afternoon = afternoon.merge(products, on='product_id')
afternoon
morning_pct = product_orders_by_hour.merge(morning, on='product_id').sort_values(['mean_hour', 'order_hour_of_day'])
afternoon_pct = product_orders_by_hour.merge(afternoon, on='product_id').sort_values(['mean_hour', 'order_hour_of_day'], ascending=False)
# get list of morning and afteroon product names
morning_product_names = list(morning_pct['product_name'].unique())
morning_product_names = '\n'.join(morning_product_names)
afternoon_product_names = list(afternoon_pct['product_name'].unique())
afternoon_product_names = '\n'.join(afternoon_product_names)
# Figure Size
fig, ax = plt.subplots(figsize=(16, 10))
# Plot
# One translucent line per product: orange = morning products, green = afternoon.
morning_pct.groupby('product_id').plot(x='order_hour_of_day',
y='pct',
ax=ax,
legend=False,
alpha=0.2,
aa=True,
color='#ff8200',
linewidth=1.5,)
afternoon_pct.groupby('product_id').plot(x='order_hour_of_day',
y='pct',
ax=ax,
legend=False,
alpha=0.2,
aa=True,
color = '#43b02a',
linewidth=1.5,)
# Aesthetics
# Margins
plt.margins(x=0.5, y=0.05)
# Hide spines
for spine in ax.spines.values():
    spine.set_visible(False)
# Labels
label_font_size = 14
plt.xlabel('Hour of Day Ordered', fontsize=label_font_size)
plt.ylabel('Percent of Orders by Product', fontsize=label_font_size)
# Tick Range
tick_font_size = 12
ax.tick_params(labelsize=tick_font_size)
plt.xticks(range(0, 25, 2))
plt.yticks(range(0, 16, 5))
plt.xlim([-2, 28])
# Vertical line at noon
plt.vlines(x=12, ymin=0, ymax=15, alpha=0.5, color='gray', linestyle='dashed', linewidth=1.0)
# Text
# Product-name lists drawn in the top corners (axes coordinates).
text_font_size = 12
ax.text(0.01, 0.95, morning_product_names,
verticalalignment='top', horizontalalignment='left',
transform=ax.transAxes,
color='#ff8200', fontsize=text_font_size)
ax.text(0.99, 0.95, afternoon_product_names,
verticalalignment='top', horizontalalignment='right',
transform=ax.transAxes,
color='#43b02a', fontsize=text_font_size);
ax = plt.gca()
ax.set_facecolor('#fff0db')
plt.show()
# Missing-value audit of order_products: absolute counts and fractions.
total = order_products.isnull().sum().sort_values(ascending=False)
percent = order_products.isnull().mean().sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total Missing', 'Percent'])
missing_data
# Same audit on the fully-joined table.
all_data = pd.merge(user_order_products_all_details, departments, on='department_id')
all_data.isnull().sum().sort_values(ascending=False)
# they are not missing - it's first order
# Build a user x product purchase matrix for clustering.
order_products_cust = order_products.merge(products, on ='product_id', how='left')
order_products_cust = order_products_cust.merge(departments, on ='department_id', how='left')
order_products_cust = order_products_cust.merge(orders, on='order_id', how='left')
order_products_cust.shape
order_products_cust
# Row-normalize so each user's product counts sum to 1.
cross_df = pd.crosstab(order_products_cust.user_id, order_products_cust.product_name)
df = cross_df.div(cross_df.sum(axis=1), axis=0)
df.head()
# Reduce the normalized user-product matrix to 10 principal components.
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
df_pca = pd.DataFrame(pca.fit_transform(df))
df_pca.head()
# Elbow method: KMeans inertia for k = 1..9 on the PCA-reduced data.
inertia_per_k = []
candidate_ks = range(1, 10)
for k in candidate_ks:
    model = KMeans(n_clusters=k).fit(df_pca)
    inertia_per_k.append(model.inertia_)
plt.subplots(figsize=(8, 5))
plt.plot(candidate_ks, inertia_per_k, 'bx-', color='#43b02a')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
ax = plt.gca()
ax.set_facecolor('#fff0db')
plt.show()
# Final clustering with k=5 (chosen from the elbow plot above).
clusterer = KMeans(n_clusters=5,random_state=42).fit(df_pca)
centers = clusterer.cluster_centers_
c_preds = clusterer.predict(df_pca)
print(centers)
# Scatter the first two principal components colored by cluster.
temp_df = df_pca.iloc[:, 0:2]
temp_df.columns = ["pc1", "pc2"]
temp_df['cluster'] = c_preds
fig, ax = plt.subplots(figsize = (8, 5))
ax = sns.scatterplot(data = temp_df, x = "pc1", y = "pc2", hue = "cluster")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.set_title("Cluster Visualization")
ax = plt.gca()
ax.set_facecolor('#fff0db')
plt.show()
# top products per cluster
# Attach the cluster label to the raw (unnormalized) user x product counts.
cross_df['cluster'] = c_preds
cluster1 = cross_df[cross_df.cluster == 0]
cluster2 = cross_df[cross_df.cluster == 1]
cluster3 = cross_df[cross_df.cluster == 2]
cluster4 = cross_df[cross_df.cluster == 3]
cluster5 = cross_df[cross_df.cluster == 4]
# For each cluster: size, then the 10 products with the highest mean count.
cluster1.shape
cluster1.drop('cluster',axis=1).mean().sort_values(ascending=False)[0:10]
cluster2.shape
cluster2.drop('cluster',axis=1).mean().sort_values(ascending=False)[0:10]
cluster3.shape
cluster3.drop('cluster',axis=1).mean().sort_values(ascending=False)[0:10]
cluster4.shape
cluster4.drop('cluster',axis=1).mean().sort_values(ascending=False)[0:10]
cluster5.shape
cluster5.drop('cluster',axis=1).mean().sort_values(ascending=False)[0:10]
# Mean purchase count per cluster for the most common grocery categories.
c1 = cluster1.drop('cluster', axis=1).mean().sort_values(ascending=False)
c2 = cluster2.drop('cluster', axis=1).mean().sort_values(ascending=False)
c3 = cluster3.drop('cluster', axis=1).mean().sort_values(ascending=False)
c4 = cluster4.drop('cluster', axis=1).mean().sort_values(ascending=False)
c5 = cluster5.drop('cluster', axis=1).mean().sort_values(ascending=False)
from IPython.display import display, HTML
top_categories = ['fresh fruits', 'fresh vegetables', 'packaged vegetables fruits', 'yogurt',
                  'packaged cheese', 'milk', 'water seltzer sparkling water', 'chips pretzels']
# Build the comparison table with a loop instead of five copy-pasted rows.
cluster_means = pd.DataFrame(
    [[c[cat] for cat in top_categories] for c in (c1, c2, c3, c4, c5)],
    columns=top_categories)
HTML(cluster_means.to_html())
# Express each cluster's row as percentages of its own total.
cluster_perc = cluster_means.apply(lambda row: (row / row.sum()) * 100, axis=1)
HTML(cluster_perc.to_html())
#calculate avg
# Build per-product features: order count, reorder count/rate, mean cart
# position, and mean order_dow / hour / days_since_prior_order.
# NOTE(review): these merges join `products`' positional RangeIndex against
# indexes keyed by product_id values — if product_id does not start at 0 and
# run contiguously, rows are misaligned by one. Verify against the data.
temp = pd.merge(left=products,
right=order_products.product_id.value_counts().to_frame('count'),
left_index=True, right_index=True)
temp = pd.merge(left=temp,
right=pd.DataFrame(order_products.groupby('product_id').reordered.sum().to_frame(), dtype='int64'),
left_index=True, right_index=True)
temp['reorder_rate'] = temp['reordered']/temp['count']
temp = pd.merge(left=temp,
right=order_products.groupby('product_id').add_to_cart_order.mean().to_frame('add_to_cart_mean'),
left_index=True, right_index=True)
# NOTE(review): the three merges below join order_id values to `orders`'
# positional index (right_index=True) — presumably intended to join on
# orders.order_id; confirm, since any index offset shifts every feature.
temp = pd.merge(left=temp,
right=pd.merge(left=order_products,
right=orders[['order_dow', 'order_hour_of_day', 'days_since_prior_order']],
left_on='order_id', right_index=True).groupby('product_id').order_dow.mean().to_frame(),
left_index=True, right_index=True)
temp = pd.merge(left=temp,
right=pd.merge(left=order_products,
right=orders[['order_dow', 'order_hour_of_day', 'days_since_prior_order']],
left_on='order_id', right_index=True).groupby('product_id').order_hour_of_day.mean().to_frame(),
left_index=True, right_index=True)
temp = pd.merge(left=temp,
right=pd.merge(left=order_products,
right=orders[['order_dow', 'order_hour_of_day', 'days_since_prior_order']],
left_on='order_id', right_index=True).groupby('product_id').days_since_prior_order.mean().to_frame(),
left_index=True, right_index=True)
display(temp.head())
temp.shape
#Scaling with StandardScaler
# Drop non-numeric / redundant columns, then standardize all features.
temp.drop(['product_name', 'department_id', 'reordered'], axis=1, inplace=True)
from sklearn.preprocessing import StandardScaler
temp_scaled = StandardScaler().fit_transform(temp)
def fancy_dendrogram(*args, **kwargs):
    """Plot a scipy dendrogram annotated with merge distances.

    Extra keyword arguments (popped before forwarding to dendrogram()):
        max_d: if set, draws a horizontal cut line and doubles as the
            default color_threshold.
        annotate_above: only annotate merges whose distance exceeds this.
    Returns the dict produced by scipy's dendrogram().
    Relies on the module-level `dendrogram` import (done further below,
    before this function is first called).
    """
    max_d = kwargs.pop('max_d', None)
    if max_d and 'color_threshold' not in kwargs:
        kwargs['color_threshold'] = max_d
    annotate_above = kwargs.pop('annotate_above', 0)
    plt.figure(figsize=(15,10))
    ddata = dendrogram(*args, **kwargs)
    if not kwargs.get('no_plot', False):
        plt.title('Hierarchical Clustering Dendrogram (truncated)')
        plt.xlabel('sample index or (cluster size)')
        plt.ylabel('distance')
        # icoord/dcoord are the x/y coordinates of each inverted-U link.
        for i, d, c in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']):
            x = 0.5 * sum(i[1:3])  # horizontal midpoint of the link
            y = d[1]  # merge distance
            if y > annotate_above:
                plt.plot(x, y, 'o', c=c)
                plt.annotate("%.3g" % y, (x, y), xytext=(0, -5),
                             textcoords='offset points',
                             va='top', ha='center')
        if max_d:
            plt.axhline(y=max_d, c='k')
    return ddata
from scipy.cluster.hierarchy import dendrogram, ward
# Ward linkage on the scaled product features, truncated to the last 30 merges.
linked_array = ward(temp_scaled)
fancy_dendrogram(
linked_array,
truncate_mode='lastp',
p=30,
leaf_rotation=90.,
leaf_font_size=12.,
show_contracted=True,
annotate_above=10,
max_d=12.5
)
ax = plt.gca()
ax.set_facecolor('#fff0db')
plt.show()
from scipy.cluster.hierarchy import fcluster
# Cut the dendrogram at distance 12.5 and report per-cluster reorder rates.
max_d = 12.5
clusters = fcluster(linked_array, max_d, criterion='distance')
labels, counts = np.unique(clusters, return_counts=True)
temp['clusters'] = clusters
print('reorder rates for each cluster\n')
# fcluster labels run 1..k, so labels/counts pair up directly.
for label, n_members in zip(labels, counts):
    print('\nlabel: {}'.format(label))
    print('n: {}'.format(n_members))
    print('rr: {}'.format(round(temp[temp['clusters'] == label].reorder_rate.mean() * 100, 2)))
# Per-cluster mean feature profile plus cluster size (clusters 1-3 only).
merged_df = pd.DataFrame()
for i in range(1,4):
    test = pd.DataFrame(temp[temp['clusters'] == i].mean())
    # The mean of the constant 'clusters' column is i, so it serves as the index.
    test = test.T.set_index('clusters', drop = True)
    test['size'] = temp[temp['clusters'] == i].shape[0]
    merged_df = pd.concat([merged_df, test])
merged_df.T.round(2).drop('product_id')
# Build the order x product presence matrix for association-rule mining.
df = pd.merge(order_products, products, how='left', on='product_id').drop(["product_id", "department_id","add_to_cart_order", "reordered"], axis=1)
# Define dataset to machine learning
# 1 if the product appears in the order, 0 otherwise.
market_basket = pd.pivot_table(df, index='order_id', columns='product_name',aggfunc=lambda x: 1 if len(x)>0 else 0).fillna(0)
market_basket.head()
Support:
refers to the default popularity of an item and can be calculated by finding number of transactions containing a particular item divided by total number of transactions
Confidence:
refers to the likelihood that an item B is also bought if item A is bought. It can be calculated by finding the number of transactions where A and B are bought together, divided by total number of transactions where A is bought
Lift:
refers to the increase in the ratio of the sale of B when A is sold. Lift(A -> B) can be calculated by dividing Confidence(A -> B) by Support(B)
Leverage:
computes the difference between the observed frequency of A and C appearing together and the frequency that would be expected if A and C were independent
Conviction:
A high conviction value means that the consequent is highly depending on the antecedent
# Apriori method request a min_support: Support is defined as the percentage of time that an itemset appears in the dataset.
# Defined to start seeing data/results with min_support of 5%
itemsets = apriori(market_basket, min_support= 0.05, use_colnames=True)
# Build your association rules using the mxltend association_rules function.
# min_threshold can be thought of as the level of confidence percentage that you want to return
# Defined to use 50% of min_threshold
# (actual threshold used here is 0.6, i.e. 60% confidence)
rulesConfidence = association_rules(itemsets, metric='confidence', min_threshold=0.6)
rulesConfidence.sort_values(by='confidence', ascending=False, inplace=True)
rulesConfidence.head()
# Generate the association rules - by lift
rulesLift = association_rules(itemsets, metric="lift", min_threshold=1.6)
rulesLift.sort_values(by='lift', ascending=False, inplace=True)
rulesLift.head()
# Scatter of support vs. confidence for the mined rules.
support = rulesConfidence.support.to_numpy()
confidence = rulesConfidence.confidence.to_numpy()
# (Removed a no-op loop that reassigned each array element to itself.)
plt.figure(figsize=(8,6))
plt.title('Association Rules')  # fixed misspelled title
plt.xlabel('support')
plt.ylabel('confidence')  # fixed misspelled axis label
sns.regplot(x=support, y=confidence, fit_reg=False, color = '#ff8200')
ax = plt.gca()
ax.set_facecolor('#fff0db')
plt.show()
# Recommendation of Market Basket
# Keep only strong rules: high lift and at least 70% confidence.
rec_rules = rulesConfidence[ (rulesConfidence['lift'] > 1.7) & (rulesConfidence['confidence'] >= 0.7) ]
# Recommendation of Market Basket Dataset
# (Removed an identity `rename` and a redundant DataFrame copy — both no-ops.)
cols_drop = ['antecedent support', 'consequent support', 'leverage', 'conviction']
recommendation_basket = rec_rules.drop(columns=cols_drop).sort_values(by=['lift'], ascending = False)
display(recommendation_basket)
# Number of items on the rule's left-hand side.
rulesConfidence['lhs items'] = rulesConfidence['antecedents'].apply(lambda x:len(x))
# Replace frozen sets with strings
rulesConfidence['antecedents_'] = rulesConfidence['antecedents'].apply(lambda a: ','.join(list(a)))
rulesConfidence['consequents_'] = rulesConfidence['consequents'].apply(lambda a: ','.join(list(a)))
# Transform the DataFrame of rulesConfidence into a matrix using the lift metric
# Only multi-item antecedents are shown.
pivot = rulesConfidence[rulesConfidence['lhs items']>1].pivot(index = 'antecedents_',
columns = 'consequents_', values= 'lift')
# Generate a heatmap with annotations on and the colorbar off
plt.figure(figsize=(10,6))
sns.heatmap(pivot, annot = True,cmap="Greens")
plt.yticks(rotation=0)
plt.xticks(rotation=90)
ax = plt.gca()
ax.set_facecolor('#fff0db')
plt.show()
# Ten rules with the highest confidence.
top_rules = rulesConfidence.sort_values('confidence', ascending = False)[:10]
top_rules
def draw_graph(rules, rules_to_show):
    """Plot the first `rules_to_show` association rules as a directed graph.

    Each rule i becomes an orange node "Ri"; its antecedent and consequent
    items become green product nodes, with edges antecedent -> Ri -> consequent.
    NOTE: edge colors come from 50 random values, so rules_to_show must be <= 50.
    """
    G1 = nx.DiGraph()
    color_map = []
    N = 50
    colors = np.random.rand(N)  # one random edge color per rule
    # Rule-node names. The original hard-coded only R0..R11, which mis-colored
    # rule nodes whenever rules_to_show > 12; generate them instead.
    rule_names = {"R" + str(i) for i in range(rules_to_show)}
    for i in range(rules_to_show):
        G1.add_nodes_from(["R" + str(i)])
        for a in rules.iloc[i]['antecedents']:
            G1.add_nodes_from([a])
            G1.add_edge(a, "R" + str(i), color=colors[i], weight=2)
        for c in rules.iloc[i]['consequents']:
            G1.add_nodes_from([c])
            G1.add_edge("R" + str(i), c, color=colors[i], weight=2)
    # Orange for rule nodes, green for product nodes.
    for node in G1:
        color_map.append('#ff8200' if node in rule_names else '#43b02a')
    edges = G1.edges()
    colors = [G1[u][v]['color'] for u, v in edges]
    weights = [G1[u][v]['weight'] for u, v in edges]
    pos = nx.spring_layout(G1, k=16, scale=1, seed=1)
    nx.draw(G1, pos, edgelist=edges, node_color=color_map, edge_color=colors,
            width=weights, font_size=16, with_labels=False)
    for p in pos:  # raise text positions so labels sit above the nodes
        pos[p][1] += 0.07
    nx.draw_networkx_labels(G1, pos)
    ax = plt.gca()
    ax.set_facecolor('#fff0db')
    plt.show()
draw_graph (rulesConfidence, 10)
itemsets
frequent_itemsets=itemsets.copy()
# Add a column with the length of each itemset
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
# Inspect pairs: Length=2 and Support>=0.5
frequent_itemsets[(frequent_itemsets['length'] == 2) & (frequent_itemsets['support'] >= 0.5)]
# Substitute products: keep all rules (threshold 0) and rank by ascending lift —
# low lift suggests the items may substitute for each other.
rulesLift2 = association_rules(frequent_itemsets, metric="lift", min_threshold=0.0)
rulesLift2.sort_values(by='lift', ascending=True, inplace=True)
rulesLift2.head(20)
# Generate the association rules - by confidence
rulesConfidence = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.50)
### Plot a basic network graph of the top 50 confidence rules
# Work on a deep copy so the original rule table stays untouched.
rulesToPlot = rulesConfidence.copy(deep=True)
# Turn the frozensets into comma-separated strings for readable node labels.
rulesToPlot['LHS'] = rulesToPlot['antecedents'].apply(lambda items: ','.join(list(items)))
rulesToPlot['RHS'] = rulesToPlot['consequents'].apply(lambda items: ','.join(list(items)))
# A rule and its reverse yield the same sorted pair — drop such duplicates.
rulesToPlot['sortedRow'] = [sorted([a,b]) for a,b in zip(rulesToPlot.LHS, rulesToPlot.RHS)]
rulesToPlot['sortedRow'] = rulesToPlot['sortedRow'].astype(str)
rulesToPlot.drop_duplicates(subset=['sortedRow'], inplace=True)
rulesToPlot
# Plot the first 50 remaining rules as an undirected network.
rulesToPlot = rulesToPlot.head(50)
fig = plt.figure(figsize=(20, 20))
G = nx.from_pandas_edgelist(rulesToPlot, 'LHS', 'RHS')
my_pos = nx.spring_layout(G, seed=1)  # fixed seed for reproducibility
nx.draw(G, pos=my_pos, with_labels=True, edge_color='#43b02a', node_size=40,
        node_color="#ff8200", font_color='black', font_size=20)
plt.axis('equal')
plt.show()
# High Confidence and high Lift - complementary products, parameter 1 (strictest: conf>=0.8, lift>=1.8)
rulesConfidence[(rulesConfidence['confidence'] >= 0.8) & (rulesConfidence['lift'] >= 1.8)]
# Complement
# High Confidence and high Lift - complementary products, parameter 2 (conf>=0.7, lift>=1.7)
rulesConfidence[(rulesConfidence['confidence'] >= 0.7) & (rulesConfidence['lift'] >= 1.7)]
# Complement
# High Confidence and high Lift - complementary products, parameter 3 (loosest: conf>=0.6, lift>=1.6)
rulesConfidence[(rulesConfidence['confidence'] >= 0.6) & (rulesConfidence['lift'] >= 1.6)]
# First-ever orders: days_since_prior_order is NaN only on a user's first order.
first_order = orders[orders['days_since_prior_order'].isnull()]
first_order_products = pd.merge(first_order, order_products, on='order_id', how='left')
# Rank the top 10 best-selling items in first orders.
grouped = first_order_products.groupby("product_id")["reordered"].agg(frequency_count='count').reset_index()
grouped = pd.merge(grouped, products[['product_id', 'product_name']], how='left', on=['product_id'])
# (Removed an unused `percent` value_counts computation — dead code.)
grouped = grouped.sort_values(by='frequency_count', ascending=False)[:10]
grouped[['product_name', 'frequency_count']].reset_index(drop=True)
# We can see a difference for the 1st and 2nd place only.
# Let's check by department.
items = pd.merge(left=products, right=departments, how='left')
users_flow_first = first_order_products.merge(items, how='inner', left_on='product_id', right_on='product_id')
grouped = users_flow_first.groupby("department")["order_id"].agg(Total_orders='count').reset_index()
# Percentage share of each department in total first-order lines.
grouped['Ratio'] = round(grouped["Total_orders"].apply(lambda x: x / grouped['Total_orders'].sum()) * 100, 2)
grouped.sort_values(by='Total_orders', ascending=False, inplace=True)
grouped.reset_index(drop=True)[:10]
# difference 6-9
# Rebuild the first-order data, this time keeping only the order_id column.
first_order = orders.loc[orders['days_since_prior_order'].isnull(),'order_id']
first_order_products=pd.merge(first_order,order_products, on='order_id',how='left')
first_order_df = pd.merge(first_order_products, products, how='left', on='product_id').drop(["product_id", "department_id","add_to_cart_order", "reordered"], axis=1)
# Define dataset to machine learning: one row per order, one column per product,
# cell = 1 if the product appears in the order, else 0.
market_basket_first = pd.pivot_table(first_order_df, index='order_id', columns='product_name',aggfunc=lambda x: 1 if len(x)>0 else 0).fillna(0)
market_basket_first.head()
# Apriori method requests a min_support: Support is defined as the percentage of time that an itemset appears in the dataset.
# Defined to start seeing data/results with min_support of 5%
itemsets = apriori(market_basket_first, min_support= 0.05, use_colnames=True)
# Build your association rules using the mlxtend association_rules function.
# Defined to use confidence level of 0.6 of min_threshold
rules = association_rules(itemsets, metric='confidence', min_threshold=0.6)
# Below the list of products sales combinations.
# This information can be used to build a cross-sell recommendation system that promotes these products with each other.
rules.sort_values("lift", ascending = False, inplace = True)
rules.head(10)
top_rules = rules.sort_values('confidence', ascending = False)[:10]
top_rules
frequent_itemsets=itemsets.copy()
##### EXPLORE FREQUENT_ITEMSETS #####
# Add a column with the length of each itemset
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
# Inspect pairs: Length=2 and Support>=0.1
frequent_itemsets[(frequent_itemsets['length'] == 2) & (frequent_itemsets['support'] >= 0.1)]
# Generate the association rules - by confidence
rulesConfidence = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.60)
### Plot a basic network graph of the top 50 confidence rules
# Create a copy of the rules and transform the frozensets to strings
rulesToPlot = rulesConfidence.copy(deep=True)
rulesToPlot['LHS'] = [','.join(list(x)) for x in rulesToPlot['antecedents']]
rulesToPlot['RHS'] = [','.join(list(x)) for x in rulesToPlot['consequents']]
# Remove duplicates of reversed rules (A->B and B->A collapse to one sorted pair)
rulesToPlot['sortedRow'] = [sorted([a,b]) for a,b in zip(rulesToPlot.LHS, rulesToPlot.RHS)]
rulesToPlot['sortedRow'] = rulesToPlot['sortedRow'].astype(str)
rulesToPlot.drop_duplicates(subset=['sortedRow'], inplace=True)
rulesToPlot
# Plot the first 50 rules
rulesToPlot=rulesToPlot[:50]
fig = plt.figure(figsize=(20, 20))
G = nx.from_pandas_edgelist(rulesToPlot, 'LHS', 'RHS')
my_pos = nx.spring_layout(G, seed = 1) #for reproducibility
nx.draw(G, pos=my_pos, with_labels=True, edge_color = '#43b02a', node_size=40, node_color="#ff8200", font_color = 'black', font_size=20)
plt.axis('equal')
plt.show()
# Baskets restricted to the first 15 items added to the cart.
# NOTE(review): user_order_products_all_details is built earlier in the notebook (not visible here).
atco = user_order_products_all_details[user_order_products_all_details['add_to_cart_order'] <= 15]
atco = pd.merge(atco,departments, how='left',on='department_id')
grouped = atco.groupby("department")["order_id"].agg( Total_orders = 'count').reset_index()
# Percentage share of each department.
grouped['Ratio'] = round(grouped["Total_orders"].apply(lambda x: x /grouped['Total_orders'].sum())*100,2)
grouped.sort_values(by='Total_orders', ascending=False, inplace=True)
grouped.reset_index(drop=True)[:10]
atco = atco[['order_id','product_name']]
# Define dataset to machine learning: order x product incidence matrix (1/0).
market_basket_atco = pd.pivot_table(atco, index='order_id', columns='product_name',aggfunc=lambda x: 1 if len(x)>0 else 0).fillna(0)
market_basket_atco.head()
# Apriori method requests a min_support: the fraction of orders an itemset appears in.
# Defined to start seeing data/results with min_support of 5%.
# BUG FIX: this cell analyzes the add-to-cart-order basket, but the original
# ran apriori on market_basket_first (the first-order basket) — use
# market_basket_atco, built in the previous cell, instead.
itemsets = apriori(market_basket_atco, min_support=0.05, use_colnames=True)
# Build association rules with mlxtend's association_rules function,
# keeping rules with confidence >= 0.6.
rules = association_rules(itemsets, metric='confidence', min_threshold=0.6)
# Below the list of products sales combinations.
# This information can be used to build a cross-sell recommendation system
# that promotes these products with each other.
rules.sort_values("lift", ascending=False, inplace=True)
rules.head(10)
top_rules = rules.sort_values('confidence', ascending=False)[:10]
top_rules
frequent_itemsets=itemsets.copy()
##### EXPLORE FREQUENT_ITEMSETS #####
# Add a column with the length of each itemset
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
# Inspect pairs: Length=2 and Support>=0.1
frequent_itemsets[(frequent_itemsets['length'] == 2) & (frequent_itemsets['support'] >= 0.1)]
# Generate the association rules - by confidence
rulesConfidence = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)
### Plot a basic network graph of the top confidence rules
# Create a copy of the rules and transform the frozensets to strings
rulesToPlot = rulesConfidence.copy(deep=True)
rulesToPlot['LHS'] = [','.join(list(x)) for x in rulesToPlot['antecedents']]
rulesToPlot['RHS'] = [','.join(list(x)) for x in rulesToPlot['consequents']]
# Remove duplicates of reversed rules (A->B and B->A collapse to one sorted pair)
rulesToPlot['sortedRow'] = [sorted([a,b]) for a,b in zip(rulesToPlot.LHS, rulesToPlot.RHS)]
rulesToPlot['sortedRow'] = rulesToPlot['sortedRow'].astype(str)
rulesToPlot.drop_duplicates(subset=['sortedRow'], inplace=True)
rulesToPlot
# Plot the first 20 rules
rulesToPlot=rulesToPlot[:20]
fig = plt.figure(figsize=(20, 20))
G = nx.from_pandas_edgelist(rulesToPlot, 'LHS', 'RHS')
my_pos = nx.spring_layout(G, seed = 1) #for reproducibility
nx.draw(G, pos=my_pos, with_labels=True, edge_color = '#43b02a', node_size=40, node_color="#ff8200", font_color = 'black', font_size=20)
plt.axis('equal')
plt.show()
Limitation:
days_since_prior_order = 0 means the individual ordered twice in the same day. Could we consider this as one basket? Perhaps the individual forgot to order other items they needed. We can't explore this, because we only have a sample and don't have the data for the earlier order made that day.
data2 = pd.merge(user_order_products_all_details, departments, on='department_id')
# We plan to conduct the days of the week analysis then hours of the day. However, instead of hours of the day we
# transform it to 4 categorical values--times of the day:
# Morning (6 to 11, since 11:59 is still stamped as hour 11), Afternoon (12 to 16), evening (17 to 20), night (21 to 5).
# The exact hours of the times of the day are subjective and seasonally changing. We will stick to the above for simplicity.
print(sum((data2['order_hour_of_day']>=6) & (data2['order_hour_of_day']<12)))
print(sum((data2['order_hour_of_day']>=12) & (data2['order_hour_of_day']<17)))
print(sum((data2['order_hour_of_day']>=17) & (data2['order_hour_of_day']<21)))
# night:
print(sum(data2['order_hour_of_day']>=21))
print(sum(data2['order_hour_of_day']<6))
print((sum(data2['order_hour_of_day']>=21))+(sum(data2['order_hour_of_day']<6)))
#Morning = 0, Afternoon = 1, Evening = 2, Night = 3
# tofd = time of the day
data2['tofd'] = data2['order_hour_of_day']
# The masks below jointly cover every hour 0-23, so each row is relabeled exactly once.
data2.loc[(data2['order_hour_of_day']>=6) & (data2['order_hour_of_day']<12), 'tofd'] = 0
data2.loc[(data2['order_hour_of_day']>=12) & (data2['order_hour_of_day']<17), 'tofd'] = 1
data2.loc[(data2['order_hour_of_day']>=17) & (data2['order_hour_of_day']<21), 'tofd'] = 2
# night
data2.loc[data2['order_hour_of_day']>=21, 'tofd'] = 3
data2.loc[data2['order_hour_of_day']<6, 'tofd'] = 3
#Check: the four counts should match the raw hour counts printed above.
print(sum(data2['tofd']==0))
print(sum(data2['tofd']==1))
print(sum(data2['tofd']==2))
print(sum(data2['tofd']==3))
data2['tofd'].value_counts(sort=False).plot(kind='bar', color = '#43b02a')
ax = plt.gca()
ax.set_facecolor('#fff0db')
plt.show()
# Data split per dow (day of week, 0-6), saved in a list.
dow_range = [0,1,2,3,4,5,6]
df_dow_list=[]
for i in dow_range:
    df_input = data2[(data2['order_dow']==i)]
    df_dow_list.append(df_input)
Day index from the list.
0 = Sun
1 = Mon
2 = Tue
3 = Wed
4 = Thu
5 = Fri
6 = Sat
#example to retrieve Sunday data
df_dow_list[0]
# Data split per dow (days of the week) per tofd (time of the day), saved in a list.
dow_range = [0,1,2,3,4,5,6]
tofd_range = [0,1,2,3]
df_tofd_list=[]
for i in dow_range:
    for j in tofd_range:
        # List position for day i, time j is i*4 + j.
        df_input = data2[(data2['order_dow']==i) & (data2['tofd']==j)]
        df_tofd_list.append(df_input)
df_tofd_list[0]
The dfs_list can be accessed:
Day range: 0 = Sunday...6 = Saturday
Time range: 0 = Morning...3 = Night
Day : Time
0 : 0 = df_tofd_list[0]
0 : 1 = df_tofd_list[1]
0 : 2 = df_tofd_list[2]
0 : 3 = df_tofd_list[3]
1 : 0 = df_tofd_list[4]
1 : 1 = df_tofd_list[5]
1 : 2 = df_tofd_list[6]
1 : 3 = df_tofd_list[7]
...(continues)...
6 : 3 = df_tofd_list[27]
Notice the multiples of 4 is a new day
For easier reference:
0 = Sun
4 = Mon
8 = Tue
12 = Wed
16 = Thu
20 = Fri
24 = Sat
# Count the number of order lines in each of the 28 day/time splits.
# (DataFrame.append was removed in pandas 2.0 — collect rows and concat once.)
rows = [pd.DataFrame([i, len(df_tofd_list[i])]).T for i in range(len(df_tofd_list))]
df_output = pd.concat(rows, ignore_index=True) if rows else pd.DataFrame()
df_output
# plotting frequency of orders over day and time (i.e. morning, afternoon, evening, night)
# example: identifier 3 is Sunday night
plt.plot(df_output[0], df_output[1], color='#ff8200')
plt.title('Distribution', fontsize=14)
plt.xlabel('day-time identifier', fontsize=14)
plt.ylabel('order count', fontsize=14)
plt.rcParams['figure.figsize'] = [10, 5]
plt.rcParams['figure.dpi'] = 100
ax = plt.gca()
ax.set_facecolor('#fff0db')
plt.show()
# dfl = list of data frames (e.g., df_tofd_list)
# ms = apriori min_support value, 0.05 by default
# cmt = confidence min_threshold, 0.5 by default
# lmt = lift min_threshold, 1.5 by default
# cmptc = complement: min_threshold for confidence
# cmptl = complement: min_threshold for lift
# cmptl_c = complement: ceiling for lift (99 is the "no ceiling" sentinel)
# subsmt = substitutes: min_threshold for lift
def ACL_per_split(dfl, ms=0.05, cmt=0.50, lmt=1.5, cmptc=0.8, cmptl=1.8, cmptl_c=99, subsmt=0.0):
    """Run Apriori + association-rule mining on each DataFrame split.

    For every split in `dfl` (one per day or day/time bucket), pivot it to an
    order x product incidence matrix, mine frequent itemsets, and derive:
    rules sorted by confidence, rules sorted by lift, complement candidates
    (confidence >= cmptc AND lift >= cmptl, optionally capped at cmptl_c),
    and substitute candidates (all rules, ascending lift).

    Returns (rConf_list, rLift_list, cmpt_list, subs_list), each parallel to dfl.
    (The original also accumulated filtered/pivot/apriori intermediates that
    were never returned, and had a no-op `cmptl_c=cmptl_c` line — removed.)
    """
    rConf_list = []   # association rules per split, confidence descending
    rLift_list = []   # association rules per split, lift descending
    cmpt_list = []    # potential complement rules per split
    subs_list = []    # potential substitute rules per split (lift ascending)
    for df_split in dfl:
        basket = df_split[['order_id', 'product_name']]
        # Pivot the data - rows as orders and products as columns.
        incidence = pd.pivot_table(basket, index='order_id', columns='product_name',
                                   aggfunc=lambda x: 1 if len(x) > 0 else 0).fillna(0)
        # Apriori frequent itemsets.
        freq_sets = apriori(incidence, min_support=ms, use_colnames=True)
        # Association - by confidence.
        rConf = association_rules(freq_sets, metric="confidence", min_threshold=cmt)
        rConf.sort_values(by='confidence', ascending=False, inplace=True)
        rConf_list.append(rConf)
        # Association - by lift.
        rLift = association_rules(freq_sets, metric="lift", min_threshold=lmt)
        rLift.sort_values(by='lift', ascending=False, inplace=True)
        rLift_list.append(rLift)
        # High confidence and high lift - complementary products.
        cmpt = rConf[(rConf['confidence'] >= cmptc) & (rConf['lift'] >= cmptl)]
        if cmptl_c != 99:  # 99 = no ceiling (sentinel kept from original API)
            cmpt = cmpt[cmpt['lift'] <= cmptl_c]
        cmpt_list.append(cmpt)
        # Substitute products: all rules, lowest lift first.
        subs = association_rules(freq_sets, metric='lift', min_threshold=subsmt)
        subs.sort_values(by='lift', ascending=True, inplace=True)
        subs_list.append(subs)
    return rConf_list, rLift_list, cmpt_list, subs_list
pd.options.display.max_rows = 200  # 200 max rows for data inspection
rConf_list_dow, rLift_list_dow, cmpt_list_dow, subs_list_dow = ACL_per_split(df_dow_list)
# Top-5 substitute candidates per day.
# (DataFrame.append was removed in pandas 2.0 — collect frames and concat once.)
subs_frames = []
for i in range(len(subs_list_dow)):
    df_insert = subs_list_dow[i].iloc[:5, :][["antecedents", "consequents", "lift", 'confidence', 'support']]
    df_insert["index"] = i
    subs_frames.append(df_insert)
df_subs_dow = pd.concat(subs_frames) if subs_frames else pd.DataFrame()
df_subs_dow.sort_values(by=['index', 'support'], ascending=[True, False])
Lift values are higher than 1 in many cases. This is not a problem per se, since people still buy substitutes together
(e.g. pretzels can be a substitute for mini carrots, but you still often buy them together — even more so in this case because "fresh vegetables" covers such a wide range of products).
Still, we need to experiment with different parameters for ACL_per_split(), especially the ms value,
and maybe a restriction on the maximum lift value.
# Top-5 complement candidates per day.
# (DataFrame.append was removed in pandas 2.0 — collect frames and concat once.)
cmpt_frames = []
for i in range(len(cmpt_list_dow)):
    df_insert = cmpt_list_dow[i].iloc[:5, :][["antecedents", "consequents", "lift", 'confidence', 'support']]
    df_insert["index"] = i
    cmpt_frames.append(df_insert)
df_cmpt_dow = pd.concat(cmpt_frames) if cmpt_frames else pd.DataFrame()
df_cmpt_dow.sort_values(by=['index', 'support'], ascending=[True, False])
6.3.1.2 Parameter Adjustment for Substitutes
rConf_list_dow2, rLift_list_dow2, cmpt_list_dow2, subs_list_dow2 = ACL_per_split(df_dow_list, ms=0.04, cmptc=0.7, cmptl=1.7)
# Top-20 substitute candidates per day, ranked by lift.
# (DataFrame.append was removed in pandas 2.0 — collect frames and concat once.
# The original built this identical table twice; it is built once here.)
subs_frames2 = []
for i in range(len(subs_list_dow2)):
    df_insert = subs_list_dow2[i].iloc[:20, :][["antecedents", "consequents", "lift", 'support']]
    df_insert["index"] = i
    subs_frames2.append(df_insert)
df_subs_dow2 = pd.concat(subs_frames2) if subs_frames2 else pd.DataFrame()
# Select even-numbered positions to exclude opposite-direction duplicates.
# (Note: this should NOT be done for complementary products, as direction
# matters a lot more there than for substitutes.)
df_subs_dow2.iloc[::2]
# Lowest lift values selected, then ranked by support within each day.
dfs_subs_final = df_subs_dow2.iloc[::2].sort_values(by=['index', 'support'], ascending=[True, False])
dfs_subs_final.reset_index(drop=True, inplace=True)
# dfs_subs_final.to_excel('dfs_subs_final.xlsx') # save for qualitative discussion
#### 6.3.1.3 Parameter Adjustment and Dominant Element Exclusion for Complements
# Ranked by confidence
for i in range(len(cmpt_list_dow2)):
    cmpt_list_dow2[i].sort_values(by='confidence', ascending=False, inplace=True)
# Top-30 complement candidates per day.
# (DataFrame.append was removed in pandas 2.0 — collect frames and concat once.)
cmpt_frames2 = []
for i in range(len(cmpt_list_dow2)):
    df_insert = cmpt_list_dow2[i].iloc[:30, :][["antecedents", "consequents", "lift", 'confidence', 'support']]
    df_insert["index"] = i
    cmpt_frames2.append(df_insert)
df_cmpt_dow2 = pd.concat(cmpt_frames2) if cmpt_frames2 else pd.DataFrame()
df_cmpt_dow2.sort_values(by=['index', 'support'], ascending=[True, False])
df = df_dow_list
df[0]
Complementary by exclusion, 1st element (fresh vegetables)
df = df_dow_list
# Exclude the dominant "fresh vegetables" element from every daily split.
dfs_filt = []
for daily in df:
    dfs_filt.append(daily[~daily.product_name.str.contains("fresh vegetables")])
rConf_list_filt, rLift_list_filt, cmpt_list_filt, subs_list_filt = ACL_per_split(dfs_filt, ms=0.03, cmptc=0.8, cmptl=1.2)
# Ranked by confidence
for i in range(len(cmpt_list_filt)):
    cmpt_list_filt[i].sort_values(by='confidence', ascending=False, inplace=True)
# Top-10 complement candidates per day.
# (DataFrame.append was removed in pandas 2.0 — collect frames and concat once.)
frames = []
for i in range(len(cmpt_list_filt)):
    df_insert = cmpt_list_filt[i].iloc[:10, :][["antecedents", "consequents", "lift", 'confidence', 'support']]
    df_insert["index"] = i
    frames.append(df_insert)
df_cmpt_dow2 = pd.concat(frames) if frames else pd.DataFrame()
df_cmpt_dow2.sort_values(by=['index', 'support'], ascending=[True, False])
2nd element (fresh fruits) exclusion
df = df_dow_list
# Exclude the two dominant elements: "fresh vegetables" then "fresh fruits".
dfs_filt = []
for daily in df:
    no_vege = daily[~daily.product_name.str.contains("fresh vegetables")]
    no_vege_fruit = no_vege[~no_vege.product_name.str.contains("fresh fruits")]
    dfs_filt.append(no_vege_fruit)
rConf_list_filt, rLift_list_filt, cmpt_list_filt, subs_list_filt = ACL_per_split(dfs_filt, ms=0.03, cmptc=0.65, cmptl=1.2)
# Ranked by confidence
for i in range(len(cmpt_list_filt)):
    cmpt_list_filt[i].sort_values(by='confidence', ascending=False, inplace=True)
# Top-10 complement candidates per day.
# (DataFrame.append was removed in pandas 2.0 — collect frames and concat once.)
frames = []
for i in range(len(cmpt_list_filt)):
    df_insert = cmpt_list_filt[i].iloc[:10, :][["antecedents", "consequents", "lift", 'confidence', 'support']]
    df_insert["index"] = i
    frames.append(df_insert)
df_cmpt_dow2 = pd.concat(frames) if frames else pd.DataFrame()
df_cmpt_dow2.sort_values(by=['index', 'support'], ascending=[True, False])
3rd element (packaged vegetables fruits) exclusion
df = df_dow_list
# Exclude the three dominant elements: fresh vegetables, fresh fruits,
# and packaged vegetables fruits.
dfs_filt = []
for daily in df:
    no_vege = daily[~daily.product_name.str.contains("fresh vegetables")]
    no_vege_fruit = no_vege[~no_vege.product_name.str.contains("fresh fruits")]
    no_top3 = no_vege_fruit[~no_vege_fruit.product_name.str.contains("packaged vegetables fruits")]
    dfs_filt.append(no_top3)
rConf_list_filt, rLift_list_filt, cmpt_list_filt, subs_list_filt = ACL_per_split(dfs_filt, ms=0.02, cmptc=0.5, cmptl=1.1)
# Ranked by confidence
for i in range(len(cmpt_list_filt)):
    cmpt_list_filt[i].sort_values(by='confidence', ascending=False, inplace=True)
# Top-10 complement candidates per day.
# (DataFrame.append was removed in pandas 2.0 — collect frames and concat once.)
frames = []
for i in range(len(cmpt_list_filt)):
    df_insert = cmpt_list_filt[i].iloc[:10, :][["antecedents", "consequents", "lift", 'confidence', 'support']]
    df_insert["index"] = i
    frames.append(df_insert)
df_cmpt_dow2 = pd.concat(frames) if frames else pd.DataFrame()
df_cmpt_dow2.sort_values(by=['index', 'support'], ascending=[True, False])
Consideration:
### Substitutes mapping
dfs_subs_final
# Manually curated selection: drop rows judged not to be genuine substitutes.
dfs_subs_selected = dfs_subs_final.drop(dfs_subs_final.index[[0,7,10,11,12,13,15,17,19,20,21,22,23,26,27,29,30,31,32,34,36,39,40,41,42,44,45,48,49,50,51,52,54,55,59,60,64,68]])
dfs_subs_selected
# Mapping_dow.csv presumably maps the split index to a day label — TODO confirm schema.
mapping_dow = pd.read_csv('Mapping_dow.csv')
df_dow = pd.merge(dfs_subs_selected,mapping_dow, how='left')
df1_dow = pd.merge(df_cmpt_dow,mapping_dow, how='left')
# Keep single-item antecedents only, then unwrap the frozensets to plain strings.
df_dow = df_dow[~(df_dow.antecedents.str.len() > 1)]
df_dow['antecedents'] = [list(x)[0] for x in df_dow['antecedents']]
df_dow['consequents'] = [list(x)[0] for x in df_dow['consequents']]
df1_dow.shape
df1_dow = df1_dow[~(df1_dow.antecedents.str.len() > 1)]
df1_dow['antecedents'] = [list(x)[0] for x in df1_dow['antecedents']]
df1_dow['consequents'] = [list(x)[0] for x in df1_dow['consequents']]
df1_dow
# Parallel-categories plot: Day -> antecedent -> consequent, colored by support.
fig = px.parallel_categories(df_dow, dimensions=['Day','antecedents','consequents'],
color="support", color_continuous_scale=px.colors.sequential.Inferno,
labels={'Day':'Day','antecedents':'antecedents','consequents':'consequents'})#'antecedents':'antecedents'})
fig.update_layout(legend = dict(bgcolor = '#fff0db'))
fig.show()
df_dow
rConf_list_tofd, rLift_list_tofd, cmpt_list_tofd, subs_list_tofd = ACL_per_split(df_tofd_list, ms=0.04, cmptc=0.7, cmptl=1.7)
# Top-10 substitute candidates per day/time split.
# (DataFrame.append was removed in pandas 2.0 — collect frames and concat once.)
subs_frames = []
for i in range(len(subs_list_tofd)):
    df_insert = subs_list_tofd[i].iloc[:10, :][["antecedents", "consequents", "lift", 'support']]
    df_insert["index"] = i
    subs_frames.append(df_insert)
df_subs_tofd = pd.concat(subs_frames) if subs_frames else pd.DataFrame()
# Select even-numbered positions to exclude opposite-direction duplicates, then rank by support.
dfs_subs_toft_final = df_subs_tofd.iloc[::2].sort_values(by=['index', 'support'], ascending=[True, False])
# Top-5 complement candidates per day/time split.
cmpt_frames = []
for i in range(len(cmpt_list_tofd)):
    df_insert = cmpt_list_tofd[i].iloc[:5, :][["antecedents", "consequents", "lift"]]
    df_insert["index"] = i
    cmpt_frames.append(df_insert)
df_cmpt_tofd = pd.concat(cmpt_frames) if cmpt_frames else pd.DataFrame()
df_cmpt_tofd
# some splits have no products that meet the complement criteria, hence their index is missing.
# Mapping.csv presumably maps the split index to Day/Time labels — TODO confirm schema.
mapping = pd.read_csv('Mapping.csv')
df = pd.merge(df_subs_tofd, mapping, how='left')
df1 = pd.merge(df_cmpt_tofd, mapping, how='left')
# Keep single-item antecedents only, then unwrap the frozensets to plain strings.
df = df[~(df.antecedents.str.len() > 1)]
df['antecedents'] = [list(x)[0] for x in df['antecedents']]
df['consequents'] = [list(x)[0] for x in df['consequents']]
df1.shape
df1 = df1[~(df1.antecedents.str.len() > 1)]
df1['antecedents'] = [list(x)[0] for x in df1['antecedents']]
df1['consequents'] = [list(x)[0] for x in df1['consequents']]
# Parallel-categories plot: Day -> Time -> antecedent -> consequent, colored by lift.
fig = px.parallel_categories(df, dimensions=['Day', 'Time','antecedents','consequents'],
color="lift", color_continuous_scale=px.colors.sequential.Inferno,
labels={'Day':'Day', 'Time':'Time','antecedents':'antecedents','consequents':'consequents'})#'antecedents':'antecedents'})
fig.update_layout(legend = dict(bgcolor = '#fff0db'))
fig.show()
def Recommender_System(user_id):
    """Return the top-5 product recommendations for `user_id`.

    Builds a user x product purchase-count matrix from the global
    `high_volume` DataFrame, computes user-user cosine similarity, and
    scores products by similarity-weighted purchase counts.
    """
    u = high_volume.groupby(['user_id','product_name']).size().sort_values(ascending=False).unstack().fillna(0)
    u_sim = pd.DataFrame(cosine_similarity(u), index=u.index, columns=u.index)
    p = high_volume.groupby(['product_name','user_id']).size().sort_values(ascending=False).unstack().fillna(0)
    # FIX: use the locally computed similarity matrix; the original ignored
    # u_sim and read the global cosine_dists, which is defined in a later cell.
    recommendations = pd.Series(np.dot(p.values, u_sim[user_id]), index=p.index)
    return recommendations.sort_values(ascending=False).head()
# get the list of order lines that have been reordered before
reorders = order_products[order_products['reordered'] == 1]
orders2 = orders[['order_id', 'user_id']]
reorders = pd.merge(reorders, products[['product_id', 'product_name']], how='left', on=['product_id'])
# merge to get the user_id for each reordered product line
user_orders = reorders.merge(orders2, on='order_id')
# Flag "high volume" products: products that appear as reorders more than once.
# FIX: the original assigned value_counts() (indexed by product_id) straight
# to a column; pandas aligns that on the row index, producing wrong/NaN flags.
# Map the per-product counts back onto each row's product_id instead.
product_counts = user_orders['product_id'].value_counts()
user_orders['high_volume'] = user_orders['product_id'].map(product_counts) > 1
high_volume = user_orders[user_orders['high_volume'] == True]
# get a matrix of the high-volume items each user purchased
high_volume_users = high_volume.groupby(['user_id', 'product_name']).size().sort_values(ascending=False).unstack().fillna(0)
# calculate cosine similarity between each pair of users
cosine_dists = pd.DataFrame(cosine_similarity(high_volume_users),index=high_volume_users.index, columns=high_volume_users.index)
cosine_dists.head()
# recommendation for customer id=30618
Recommender_System(30618)